Purpose:
- annotate cell types and define expanded clonotypes (B cells and T
cells)
- QC filtering, Doublet removal and automated cell type annotation was
performed in 0_preprocessing.Rmd
- overlap single cell RNA-Seq and VDJ data on proceed with analyzing
overlapping cells
Prior to running this code:
- Before starting set working directory to the
CompImmunologyWorkshopUCSF folder
- setwd(“[PATH]/CompImmunologyWorkshopUCSF”)
# Set Working Directory before starting
# Immcantation
suppressPackageStartupMessages(library(alakazam))
suppressPackageStartupMessages(library(shazam))
suppressPackageStartupMessages(library(tigger))
suppressPackageStartupMessages(library(dowser))
suppressPackageStartupMessages(library(airr))
suppressPackageStartupMessages(library(glmGamPoi))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(dittoSeq))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(impactSingleCellToolkit))
# Immcantation Results
bcr_heavy_fh <- "data/results_immcantation/BCR_CSFPB_heavy_germ-pass.tsv"
bcr_light_fh <- "data/results_immcantation/BCR_CSFPB_light_germ-pass.tsv"
tcr_beta_fh <- "data/results_immcantation/TCR_CSFPB_heavy_germ-pass.tsv"
tcr_alpha_fh <- "data/results_immcantation/TCR_CSFPB_light_germ-pass.tsv"
# seed number
seed.number <- 12345
# Workshop specific functions
source("resources/functions_workshop.R")
1. Load RNA-Seq object
- At this stage of the analysis scRNA-Seq, BCR and TCR repertoire data
have been processed and overlapped
- Doublet were removed computationally using ‘DoubletFinder’
- cells with more than 10% mitochondrial gene expression were
omitted
- Only B cells and T cells exist in this object
rnaseq_obj_full <- readRDS("objects/seurat.obj.processed.rds")
3. Only analyze cells shared across all datasets
a. get vector of overlapping cells
cells_rnaseq <- colnames(rnaseq_obj_full)
cells_vdj <- unique(c(bcr_df$unique_cell_id,
tcr_df$unique_cell_id))
overlapping_cells <- cells_rnaseq[cells_rnaseq %in% cells_vdj]
b. filter rna-seq and vdj data
- Even after filtering for cells that overlap with BCR/TCR data, there
are still sparse clusters of cells annotated as different cell
types
# only keep cells with vdj
rnaseq_obj <- rnaseq_obj_full[,colnames(rnaseq_obj_full) %in% overlapping_cells]
# add column to show which cells overlap with scRNA-Seq
bcr_df$cell_overlap_status <- "no"
tcr_df$cell_overlap_status <- "no"
bcr_df[bcr_df$unique_cell_id %in% overlapping_cells,"cell_overlap_status"] <- "yes"
tcr_df[tcr_df$unique_cell_id %in% overlapping_cells,"cell_overlap_status"] <- "yes"
4. Plot cell before and after VDJ filter
a. SingleR annotations pre and post VDJ filter
# generate UMAP of cell type annotations pre-filtering
p1_pre_filter_singler <- DimPlot(object = rnaseq_obj_full, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
# post filter umap
p2_post_filter_singler <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
plot_grid(p1_pre_filter_singler,p2_post_filter_singler,ncol = 2)

b. Azimuth annotations pre and post VDJ filter
# generate UMAP of cell type annotations pre-filtering
p1_pre_filter_azimuth <- DimPlot(object = rnaseq_obj_full, group.by="predicted.celltype.l1", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
# post filter umap
p2_post_filter_azimuth <- DimPlot(object = rnaseq_obj, group.by="predicted.celltype.l1", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
plot_grid(p1_pre_filter_azimuth,p2_post_filter_azimuth,ncol = 2)

5. Add columns to VDJ data
- This code block gets stuck when running. To fix this press
return in the R console
# Add sample column
tcr_df$sample <- tstrsplit(tcr_df$unique_cell_id,"-")[[2]]
bcr_df$sample <- tstrsplit(bcr_df$unique_cell_id,"-")[[2]]
# Add column for CSF/PB
tcr_df$COMPARTMENT <- tstrsplit(tcr_df$sample,"_")[[2]]
bcr_df$COMPARTMENT <- tstrsplit(bcr_df$sample,"_")[[2]]
tcr_df$COMPARTMENT[tcr_df$COMPARTMENT %in% "PBMC"] <- "PB"
bcr_df$COMPARTMENT[bcr_df$COMPARTMENT %in% "PBMC"] <- "PB"
# Add Status Disease/Healthy
tcr_df$STATUS <- tstrsplit(tcr_df$sample,"_")[[1]] %>% as.character()
tcr_df$STATUS[tcr_df$STATUS %in% "32"] <- "Healthy"
tcr_df$STATUS[tcr_df$STATUS %in% "28"] <- "Disease"
bcr_df$STATUS <- tstrsplit(bcr_df$sample,"_")[[1]] %>% as.character()
bcr_df$STATUS[bcr_df$STATUS %in% "32"] <- "Healthy"
bcr_df$STATUS[bcr_df$STATUS %in% "28"] <- "Disease"
6. Re-cluster
- This line should only be run once
set.seed(seed.number)
rnaseq_obj <- scaleAndClusterSeuratObject(rnaseq_obj,dims = 1:6,npca = 5,resolution = 0.3,dim.reduction = T)
## Warning: The `slot` argument of `SetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Different cells and/or features from existing assay SCT
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5291
## Number of edges: 160470
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8992
## Number of communities: 10
## Elapsed time: 0 seconds
## Warning in LayerData.Assay5(object = x, layer = i): multiple layers are identified by counts.1 counts.2 counts.3 counts.4
## only the first layer is used
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
7. Plot cell type annotations after re-clustering
a. SingleR
p3_reclustered_singler <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5,label = T,reduction = "umap")
p4_reclustered_singler <- DimPlot(object = rnaseq_obj, group.by="fine_bencode", pt.size=1.0,label.size = 5, label = T,reduction = "umap")
plot_grid(p3_reclustered_singler,p4_reclustered_singler,ncol = 2)

b. Azimuth
p3_reclustered_azimuth <- DimPlot(object = rnaseq_obj, group.by="predicted.celltype.l1", pt.size=1.0, label.size = 5,label = T,reduction = "umap")
p4_reclustered_azimuth <- DimPlot(object = rnaseq_obj, group.by="predicted.celltype.l2", pt.size=1.0,label.size = 5, label = T,reduction = "umap")
plot_grid(p3_reclustered_azimuth,p4_reclustered_azimuth,ncol = 2)

—————
Manual cell type annotation
1. Look at the data
a. Add columns and look at UMAP plots
# Add compartment column
rnaseq_obj$compartment <- tstrsplit(rnaseq_obj$sample,"_")[[2]]
rnaseq_obj$compartment <- str_replace_all(rnaseq_obj$compartment,"PBMC","PB")
# Add patient status column
rnaseq_obj$status <- tstrsplit(rnaseq_obj$sample,"_")[[1]]
rnaseq_obj$status[rnaseq_obj$status %in% "32"] <- "Healthy"
rnaseq_obj$status[rnaseq_obj$status %in% "28"] <- "Disease"
# loc
loc_cols = c("CSF" = "blue", "PB" = "red")
diagnosis_cols = c("Disease" = "purple","Healthy" = "orange")
p1 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 10, alpha = 0.7,label = T,reduction = "umap")
p2 <- DimPlot(object = rnaseq_obj, group.by="compartment", pt.size=1.0, label.size = 10, alpha = 0.7,label = F,cols = loc_cols,reduction = "umap")
p3 <- DimPlot(object = rnaseq_obj, group.by="status", pt.size=1.0, label.size = 10,alpha = 0.7,label = F,cols = diagnosis_cols,reduction = "umap")
plot_grid(p1,p2,p3,ncol = 3)

b. feature/violin of canonical genes
- For an extended list of gene markers for cell types, refer to
Celltype_Markers.xlsx in the resources
directory
- PPBP = Platelets, which are noise
tcell_panel <- c("CD3E","CD3D","CD3G","CD4","CD8A","CD8B")
bcell_panel <- c("MS4A1","CD79A","CD79B","CD19","IGHM","IGHA1")
other_celltypes_panel <- c("CD14","LYZ","MS4A7","FCER1A","APOE","PPBP")
B cell gene panel
FeaturePlot(rnaseq_obj,features = bcell_panel,ncol = 3)

T cell gene panel
FeaturePlot(rnaseq_obj,features = tcell_panel,ncol = 3)

gene panel for other cell types
- We should not see an defined clusters for other cells types since
these data should only contain B cells and T cells
- CD14+ Monocytes - LYZ, CD14
- CD16+ Monocytes - MS4A7
- Monocyte derived Dendritic Cells - FCGR3A,MS4A7
- Macrophage - APOE
- Platelet - PPBP
FeaturePlot(rnaseq_obj,features = other_celltypes_panel,ncol = 3)

2. Add main cell types
- While cell types can be annotated manually based on gene expression,
The 10X VDJ results tell us with higher confidence which cells are B
cells/T cells
main_annot <- names(rnaseq_obj$orig.ident)
main_annot[main_annot %in% bcr_df$unique_cell_id] <- "B cells"
main_annot[main_annot %in% tcr_df$unique_cell_id] <- "T cells"
names(main_annot) <- names(rnaseq_obj$orig.ident)
rnaseq_obj$main.celltype.annot <- main_annot
p4 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
p5 <- DimPlot(object = rnaseq_obj, group.by="main.celltype.annot", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
plot_grid(p4,p5, ncol = 2)

3. Add in Subtype annotations
a. Violin of CD4/CD8 genes
subtype_annotations <- colnames(rnaseq_obj)
Idents(rnaseq_obj) <- rnaseq_obj$seurat_clusters
p6 <- VlnPlot(rnaseq_obj,features = c("CD4","CD8A","CD8B"),pt.size = 0,ncol = 1)
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_grid(p6,p4,ncol = 2,rel_heights = c(8,6),rel_widths = c(6,6))

b. Define T cell subtypes based on CD8 gene expression & SingleR
annotations
- CD8 T cells are much higher in automated SingleR annotations
- In addition, there are cell types which are mis-annotated as
Macrophages, Monocytes and NK cells
- Mis-annotated celltype cluster with B cells
- We are going to re-annotate T cells manually based on CD8
expression
FeaturePlot(rnaseq_obj, features = c("CD8A", "CD8B"),blend = TRUE)

# SingleR and Azimuth annotatons
celltype_annot_df <- rnaseq_obj@meta.data[,c("sample","seurat_clusters","main_bencode","fine_bencode","main.celltype.annot","predicted.celltype.l1","predicted.celltype.l2")]
# Manual T cell annotation based on CD8 expression
CD8_pos_tcells <- colnames(subset(rnaseq_obj, subset = CD8A > 0 | CD8B > 0))
celltype_annot_df$cd8_pos <- row.names(celltype_annot_df)
celltype_annot_df$cd8_pos[celltype_annot_df$cd8_pos %in% CD8_pos_tcells] <- "yes"
celltype_annot_df$cd8_pos[! celltype_annot_df$cd8_pos %in% "yes"] <- "no"
tcell_subtype_annotation <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
CD8_pos_tcells <- tcell_subtype_annotation[tcell_subtype_annotation %in% CD8_pos_tcells]
CD4_tcells <- tcell_subtype_annotation[! tcell_subtype_annotation %in% CD8_pos_tcells]
celltype_annot_df$sub.celltype.annot <- celltype_annot_df$main.celltype.annot
celltype_annot_df[row.names(celltype_annot_df) %in% CD8_pos_tcells, "sub.celltype.annot"] <- "CD8 T cells"
celltype_annot_df[row.names(celltype_annot_df) %in% CD4_tcells, "sub.celltype.annot"] <- "CD4 T cells"
# Stacked bar
# - Not all Celltype predictions for CD8+ T cells express CD8
# SingleR
summary_main_bencode <- celltype_annot_df %>% count(seurat_clusters, main_bencode, name = "count")
levels(summary_main_bencode$main_bencode) <- sort(unique(summary_main_bencode$main_bencode))
colors_main_bencode <- c("red","darkgreen","darkblue","purple","orange","grey")
p7 <- ggplot(summary_main_bencode, aes(x = seurat_clusters, y = count, fill = main_bencode)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_main_bencode) + ggtitle("SingleR Cell-type Predictions")
# Azimuth
summary_azimuth <- celltype_annot_df %>% count(seurat_clusters, predicted.celltype.l1, name = "count")
levels(summary_azimuth$predicted.celltype.l1) <- sort(unique(summary_azimuth$predicted.celltype.l1))
colors_azimuth <- c("red","darkgreen","darkblue","purple","orange","brown","black","grey")
p8 <- ggplot(summary_azimuth, aes(x = seurat_clusters, y = count, fill = predicted.celltype.l1)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_azimuth) + ggtitle("Azimuth Cell-type Predictions")
# Annotations based on BCR and TCR VDJ data
summary_subtype <- celltype_annot_df %>% count(seurat_clusters, sub.celltype.annot, name = "count")
levels(summary_subtype$sub.celltype.annot) <- sort(unique(summary_subtype$sub.celltype.annot))
colors_subtype <- c("red","darkgreen","darkblue")
p9 <- ggplot(summary_subtype, aes(x = seurat_clusters, y = count, fill = sub.celltype.annot)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_subtype) + ggtitle("Multiomics")
plot_grid(p7,p8,p9,ncol = 1,align = "v")

c. Define B cell subtypes based on Azimuth subtype annotations
celltype_annot_df_bcr <- celltype_annot_df[celltype_annot_df$main.celltype.annot %in% "B cells",]
uni_bcell_subtypes <- unique(celltype_annot_df$predicted.celltype.l2)
non_bcell_subtypes <- uni_bcell_subtypes[! uni_bcell_subtypes %in% c("B naive","B memory","B intermediate","Plasmablast")]
# For simplicity we will annotate non-B cell annotations as Naive B cells
# - Violin plots showed that these cells have virtually no IgG, IgA or V gene expression
celltype_annot_df_bcr$predicted.celltype.l2[celltype_annot_df_bcr$predicted.celltype.l2 %in% non_bcell_subtypes] <- "B naive"
# Add B cell subtype annotations into sub.celltype.annot column
for (bsubtype in unique(celltype_annot_df_bcr$predicted.celltype.l2)) {
subset_cells <- row.names(celltype_annot_df_bcr[celltype_annot_df_bcr$predicted.celltype.l2 %in% bsubtype,])
celltype_annot_df[row.names(celltype_annot_df) %in% subset_cells,"sub.celltype.annot"] <- bsubtype
}
# Add subtype annotations back into RNA-Seq object
rnaseq_obj$sub.celltype.annot <- celltype_annot_df$sub.celltype.annot
# Plot
p10 <- dittoDimPlot(rnaseq_obj, "main.celltype.annot")
p11 <- dittoDimPlot(rnaseq_obj, "sub.celltype.annot")
grid.arrange(p10,p11,nrow=1)

4. Add cell type annotation to VDJ results
# TCR
tcell_annotation_col <- rep("No Annot",nrow(tcr_df))
names(tcell_annotation_col) <- tcr_df$unique_cell_id
subtype_annotations_tcr <- celltype_annot_df[celltype_annot_df$main.celltype.annot %in% "T cells",]
for (subtype in unique(subtype_annotations_tcr$sub.celltype.annot)) {
subtype_cells <- row.names(subtype_annotations_tcr[subtype_annotations_tcr$sub.celltype.annot %in% subtype,])
tcell_annotation_col[names(tcell_annotation_col) %in% subtype_cells] <- subtype
}
tcr_df$SUBTYPE_ANNOTATION <- tcell_annotation_col
# BCR
bcell_annotation_col<- rep("No Annot",nrow(bcr_df))
names(bcell_annotation_col) <- bcr_df$unique_cell_id
subtype_annotations_bcr <- celltype_annot_df[celltype_annot_df$main.celltype.annot %in% "B cells",]
for (subtype in unique(subtype_annotations_bcr$sub.celltype.annot)) {
subtype_cells <- row.names(subtype_annotations_bcr[subtype_annotations_bcr$sub.celltype.annot %in% subtype,])
bcell_annotation_col[names(bcell_annotation_col) %in% subtype_cells] <- subtype
}
bcr_df$SUBTYPE_ANNOTATION <- bcell_annotation_col
5. define expanded clonotypes
- createCloneCountTable & addExpansionCategories are in
functions_workshop.R
a. generate clone count data frames
- The clone counts are calculated based on the full VDJ data
clone_counts_tcr <- createCloneCountTable(tcr_df)
clone_counts_bcr <- createCloneCountTable(bcr_df)
# We can identify expanded clonotypes in the CSF with a high CSF:PB ratio
clone_count_subset <- clone_counts_tcr[clone_counts_tcr$cell_count > 1 & clone_counts_tcr$CSF_count > 1, ]
b. create categories for clonal expansion
- Highly expanded - cell count > 5
- Moderate - 1 < cells count < 5
- Non-expanded - cell count = 1
tcr_df <- addExpansionCategories(tcr_df,clone_counts_tcr)
bcr_df <- addExpansionCategories(bcr_df,clone_counts_bcr)
c. add expansion categories to scRNA-Seq object
expansion_categ_col <- paste(colnames(rnaseq_obj),rnaseq_obj$main.celltype.annot)
uni_expansion_categ <- unique(tcr_df$EXPANSION_CATEGORY)
for (categ in unique(tcr_df$EXPANSION_CATEGORY)) {
cells_categ <- c()
cells_tcr <- tcr_df[tcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
cells_tcr <- unique(paste(cells_tcr,"T cells"))
cells_bcr <- bcr_df[bcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
cells_bcr <- unique(paste(cells_bcr,"B cells"))
if(length(cells_tcr)> 0) {cells_categ <- c(cells_categ, cells_tcr)}
if(length(cells_bcr)> 0) {cells_categ <- c(cells_categ, cells_bcr)}
expansion_categ_col[expansion_categ_col %in% cells_categ] <- categ
}
names(expansion_categ_col) <- colnames(rnaseq_obj)
rnaseq_obj$expansion_category <- expansion_categ_col
6. Look at proportions
a. subset B cells
# add column for Diagnosis + Subtype & Diagnois + Compartment
rnaseq_obj$status.subtype <- paste(rnaseq_obj$status,rnaseq_obj$sub.celltype.annot)
rnaseq_obj$status.compartment <- paste(rnaseq_obj$status,rnaseq_obj$compartment)
cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "B cells"])
bcell_obj <- rnaseq_obj[,cells]
cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
tcell_obj <- rnaseq_obj[,cells]
b. sample breakdown
- Disease Patient has an increased proportion of B cells in the CSF of
which most are memory B cells
p12 <- dittoBarPlot(rnaseq_obj, "sub.celltype.annot", group.by = "status.compartment", main = "All cells",color.panel = dittoColors()[1:6])
p13 <- dittoBarPlot(bcell_obj, "sub.celltype.annot", group.by = "status.compartment", main = "B cells",color.panel = dittoColors()[1:4])
plot_grid(p12,p13,nrow=2,align = "v")

c. clonotype expansion breakdown
- Expanded T cell populations are observed in all compartments, while
expanded B cells show more restriction
p14 <- dittoBarPlot(tcell_obj, "expansion_category", group.by = "status.compartment", main = "T cells")
p15 <- dittoBarPlot(bcell_obj, "expansion_category", group.by = "status.compartment", main = "B cells")
grid.arrange(p14,p15,nrow=2)

—————
SAVE Processed objects
save(rnaseq_obj,tcell_obj,bcell_obj,bcr_df,tcr_df,clone_counts_tcr,clone_counts_bcr,file = "objects/processed_objects.RData")